# load libraries
#library(plyr)
library(tidyverse)
library(rlist)
library(broom)
library(sandwich)
library(lmtest)
library(lme4)
library(xtable)
library(reshape)
library(reshape2)
library(broom.mixed)
library(multiwayvcov)
library(stargazer)

# the working directory -> adjust per your computer
path.name <- "YOUR PATH HERE"
setwd(path.name)


################ GATHER, MERGE, AND RECODE DATA FROM THE WORLD ATLAS OF LANGUAGE STRUCTURES ################

setwd(paste0(path.name, "/data/wals data"))

# get all the .csv files and read them in
to_read <- list.files(pattern="*.csv")
imported <- lapply(to_read, function(x) read.csv(x))

# gathers column names
vars <- as.character(unlist(lapply(seq(imported), function(x) imported[[x]][1,3])))
heads <- c("Parameter", "Language", "Frequency", "Confidence", "References")
cols <- lapply(seq(imported), function(x) c("ID",vars[x],heads))

# rename and subset to prep for merging
to_merge <- lapply(seq(imported), function(x) {
                  names(imported[[x]]) <- cols[[x]]
                  imported[[x]]$ID <-  gsub("^.*?-","",imported[[x]]$ID)
                  return(data.frame(imported[[x]][1], imported[[x]][4], imported[[x]][2]))
})

# merge and clean up
wals_data <- Reduce(function(x, y) merge(x, y, all.y=TRUE, all.x=TRUE, id="ID"), to_merge)
rm(to_merge,imported,cols,to_read,heads,vars)

# recode the WALS variables to binary variables
wals_data$M.T.Pronouns <- recode(wals_data$M.T.Pronouns,
                      `No M-T pronouns` = "No M-T pronouns", .default = "Exists")
wals_data$Alignment.of.Verbal.Person.Marking <- recode(wals_data$Alignment.of.Verbal.Person.Marking,
      `Accusative` = "Accusative", .default = "Other")
wals_data$Expression.of.Pronominal.Subjects <- recode(wals_data$Expression.of.Pronominal.Subjects,
      `Subject affixes on verb` = "Subject affixes on verb", .default = "Other")
wals_data$Verbal.Person.Marking <- recode(wals_data$Verbal.Person.Marking,
      `No person marking` = "No person marking", .default = "Other")
wals_data$Order.of.Person.Markers.on.the.Verb <- recode(wals_data$Order.of.Person.Markers.on.the.Verb,
       `A and P do not or do not both occur on the verb` = "A and P do not or do not both occur on the verb", .default = "Other")
wals_data$Ditransitive.Constructions..The.Verb..Give. <- recode(wals_data$Ditransitive.Constructions..The.Verb..Give.,
       `Indirect-object construction` = "Indirect-object construction", .default = "Other")
wals_data$Reciprocal.Constructions <- recode(wals_data$Reciprocal.Constructions,
                                        `Distinct from reflexive` = "Distinct from reflexive", .default = "Other")
wals_data$Passive.Constructions <- recode(wals_data$Passive.Constructions,
                                        `Present` = "Present", .default = "Absent")
wals_data$Applicative.Constructions <- recode(wals_data$Applicative.Constructions,
                                    `No applicative construction` = "No applicative construction", .default = "Applicative")
wals_data$Antipassive.Constructions <- recode(wals_data$Antipassive.Constructions,
                                              `No antipassive` = "No antipassive", .default = "Other")
wals_data$Productivity.of.the.Antipassive.Construction <- recode(wals_data$Productivity.of.the.Antipassive.Construction,
                                              `no antipassive` = "No antipassive", .default = "Other")
wals_data$Periphrastic.Causative.Constructions <- recode(wals_data$Periphrastic.Causative.Constructions,
                                         `Purposive but no sequential` = "Purposive but no sequential", .default = "Other")
wals_data$Nonperiphrastic.Causative.Constructions <- recode(wals_data$Nonperiphrastic.Causative.Constructions,
                                          `Morphological but no compound` = "Morphological but no compound", .default = "Other")
wals_data$Negative.Morphemes <- recode(wals_data$Negative.Morphemes,
                                          `Negative affix` = "Negative affix", .default = "Other")
wals_data$Symmetric.and.Asymmetric.Standard.Negation <- recode(wals_data$Symmetric.and.Asymmetric.Standard.Negation,
                                  `Symmetric` = "Symmetric", .default = "Other")
wals_data$Negative.Indefinite.Pronouns.and.Predicate.Negation <- recode(wals_data$Negative.Indefinite.Pronouns.and.Predicate.Negation,
                                  `Predicate negation also present` = "Predicate negation also present", .default = "Other")
wals_data$Polar.Questions <- recode(wals_data$Polar.Questions,
                                  `Question particle` = "Question particle", .default = "Other")
wals_data$Predicative.Possession <- recode(wals_data$Predicative.Possession,
                               `'Have'` = "'Have'", .default = "Other")
wals_data$Predicative.Adjectives <- recode(wals_data$Predicative.Adjectives,
                                      `Nonverbal encoding` = "Nonverbal encoding", .default = "Other")
wals_data$Comparative.Constructions <- recode(wals_data$Comparative.Constructions,
                                    `Particle` = "Particle", .default = "Other")
wals_data$Relativization.on.Subjects <- recode(wals_data$Relativization.on.Subjects,
                                         `Relative pronoun` = "Relative pronoun", .default = "Other")
wals_data$Relativization.on.Obliques <- recode(wals_data$Relativization.on.Obliques,
                                          `Relative pronoun` = "Relative pronoun", .default = "Other")
wals_data$X.Want..Complement.Subjects <- recode(wals_data$X.Want..Complement.Subjects,
                                          `Subject is left implicit` = "Subject is left implicit", .default = "Other")
wals_data$Purpose.Clauses <- recode(wals_data$Purpose.Clauses,
                                           `Deranked` = "Deranked", .default = "Other")
wals_data$X.When..Clauses <- recode(wals_data$X.When..Clauses,
                               `Deranked` = "Deranked", .default = "Other")
wals_data$Reason.Clauses <- recode(wals_data$Reason.Clauses,
                               `Balanced` = "Balanced", .default = "Other")
wals_data$Utterance.Complement.Clauses <- recode(wals_data$Utterance.Complement.Clauses,
                              `Balanced` = "Balanced", .default = "Other")
wals_data$Syllable.Structure <- recode(wals_data$Syllable.Structure,
                                            `Balanced` = "Complex", .default = "Other")
wals_data$Numeral.Bases <- recode(wals_data$Numeral.Bases,
                                  `Decimal` = "Decimal", .default = "Other")
wals_data$Number.of.Basic.Colour.Categories <- recode(wals_data$Number.of.Basic.Colour.Categories,
                             `11` = "11", .default = "Other")
wals_data$Tea <- recode(wals_data$Tea,
                      `Words derived from Min Nan Chinese te` = "Words derived from Min Nan Chinese te", .default = "Other")
wals_data$Tone <- recode(wals_data$Tone,
                             `No tones` = "No Tones", .default = "Other")
wals_data$Para.Linguistic.Usages.of.Clicks <- recode(wals_data$Para.Linguistic.Usages.of.Clicks,
                    `Affective meanings` = "Affective meanings", .default = "Other")
wals_data$Order.of.Negative.Morpheme.and.Verb <- recode(wals_data$Order.of.Negative.Morpheme.and.Verb,
                                                `[V-Neg]` = "[V-Neg]", .default = "Other")
wals_data$Preverbal.Negative.Morphemes <- recode(wals_data$Preverbal.Negative.Morphemes,
                                                   `NegV` = "NegV", .default = "Other")
wals_data$Postverbal.Negative.Morphemes <- recode(wals_data$Postverbal.Negative.Morphemes,
                                            `None` = "None", .default = "Other")
wals_data$Fixed.Stress.Locations <- recode(wals_data$Fixed.Stress.Locations,
                                             `No fixed stress` = "No fixed stress", .default = "Other")
wals_data$Weight.Sensitive.Stress <- recode(wals_data$Weight.Sensitive.Stress,
                    `Fixed stress (no weight-sensitivity)` = "Fixed stress (no weight-sensitivity)", .default = "Other")
wals_data$Weight.Factors.in.Weight.Sensitive.Stress.Systems <- recode(wals_data$Weight.Factors.in.Weight.Sensitive.Stress.Systems,
                                       `Combined` = "Combined", .default = "Other")
wals_data$Rhythm.Types <- recode(wals_data$Rhythm.Types,
                                      `Trochaic` = "Trochaic", .default = "Other")
wals_data$Absence.of.Common.Consonants <- recode(wals_data$Absence.of.Common.Consonants,
                                 `All present` = "All present", .default = "Other")
wals_data$Presence.of.Uncommon.Consonants <- recode(wals_data$Presence.of.Uncommon.Consonants,
                            `'Th' sounds` = "'Th' sounds", .default = "Other")
wals_data$Consonant.Inventories <- recode(wals_data$Consonant.Inventories,
                              `Large` = "Moderately Large or Large",
                              `Moderately large` = "Moderately Large or Large", .default = "Other")
wals_data$Fusion.of.Selected.Inflectional.Formatives <- recode(wals_data$Fusion.of.Selected.Inflectional.Formatives,
                              `Exclusively concatenative` = "Exclusively concatenative", .default = "Other")
wals_data$Exponence.of.Selected.Inflectional.Formatives <- recode(wals_data$Exponence.of.Selected.Inflectional.Formatives,
                          `Monoexponential case` = "Monoexponential case", .default = "Other")
wals_data$Exponence.of.Tense.Aspect.Mood.Inflection <- recode(wals_data$Exponence.of.Tense.Aspect.Mood.Inflection,
                          `monoexponential TAM` = "monoexponential TAM", .default = "Other")
wals_data$Inflectional.Synthesis.of.the.Verb <- recode(wals_data$Inflectional.Synthesis.of.the.Verb,
                         `0-1 category per word` = "0-1 category per word", .default = "Other")
wals_data$Locus.of.Marking.in.the.Clause <- recode(wals_data$Locus.of.Marking.in.the.Clause,
                                                  `Dependent marking` = "Dependent marking", .default = "Other")
wals_data$Locus.of.Marking.in.Possessive.Noun.Phrases <- recode(wals_data$Locus.of.Marking.in.Possessive.Noun.Phrases,
                                              `Dependent marking` = "Dependent marking", .default = "Other")
wals_data$Locus.of.Marking..Whole.language.Typology <- recode(wals_data$Locus.of.Marking..Whole.language.Typology,
                                  `Dependent-marking` = "Dependent-marking", .default = "Other")
wals_data$Prefixing.vs..Suffixing.in.Inflectional.Morphology <- recode(wals_data$Prefixing.vs..Suffixing.in.Inflectional.Morphology,
                                `Strongly suffixing` = "Strongly suffixing", .default = "Other")
wals_data$Reduplication <- recode(wals_data$Reduplication,
                `No productive reduplication` = "No productive reduplication", .default = "Other")
wals_data$Case.Syncretism <- recode(wals_data$Case.Syncretism,
                             `Core and non-core` = "Core and non-core", .default = "Other")
wals_data$Syncretism.in.Verbal.Person.Number.Marking <- recode(wals_data$Syncretism.in.Verbal.Person.Number.Marking,
                               `Syncretic` = "Syncretic", .default = "Other")
wals_data$Vowel.Quality.Inventories <- recode(wals_data$Vowel.Quality.Inventories,
                        `Large (7-14)` = "Large (7-14)", .default = "Other")
wals_data$Number.of.Genders <- recode(wals_data$Number.of.Genders,
                                         `None` = "None", .default = "Other")
wals_data$Sex.based.and.Non.sex.based.Gender.Systems <- recode(wals_data$Sex.based.and.Non.sex.based.Gender.Systems,
                                 `Sex-based` = "Sex-based", .default = "Other")
wals_data$Systems.of.Gender.Assignment <- recode(wals_data$Systems.of.Gender.Assignment,
                                 `Semantic and formal` = "Semantic and formal", .default = "Other")
wals_data$Coding.of.Nominal.Plurality <- recode(wals_data$Coding.of.Nominal.Plurality,
                                            `Plural suffix` = "Plural suffix", .default = "Other")
wals_data$Occurrence.of.Nominal.Plurality <- recode(wals_data$Occurrence.of.Nominal.Plurality,
                          `Only human nouns, optional` = "Only human nouns, optional", .default = "Other")
wals_data$Plurality.in.Independent.Personal.Pronouns <- recode(wals_data$Plurality.in.Independent.Personal.Pronouns,
                          `Person-number stem` = "Person-number stem", .default = "Other")
wals_data$The.Associative.Plural <- recode(wals_data$The.Associative.Plural,
                          `No associative plural` = "No associative plural", .default = "Other")
wals_data$Definite.Articles <- recode(wals_data$Definite.Articles,
                `Definite word distinct from demonstrative` = "Definite word distinct from demonstrative", .default = "Other")
wals_data$Indefinite.Articles <- recode(wals_data$Indefinite.Articles,
                  `Indefinite word distinct from 'one'` = "Indefinite word distinct from 'one'", .default = "Other")
wals_data$Inclusive.Exclusive.Distinction.in.Independent.Pronouns <- recode(wals_data$Inclusive.Exclusive.Distinction.in.Independent.Pronouns,
          `Inclusive/exclusive` = "Inclusive/exclusive", .default = "Other")
wals_data$Consonant.Vowel.Ratio <- recode(wals_data$Consonant.Vowel.Ratio,
                                     `Moderately high` = "Moderately High or High",
                                     `High` = "Moderately High or High", .default = "Other")
wals_data$Inclusive.Exclusive.Distinction.in.Verbal.Inflection <- recode(wals_data$Inclusive.Exclusive.Distinction.in.Verbal.Inflection,
                      `'We' the same as 'I'` = "'We' the same as 'I'", .default = "Other")
wals_data$Distance.Contrasts.in.Demonstratives <- recode(wals_data$Distance.Contrasts.in.Demonstratives,
                      `No distance contrast` = "No distance contrast", .default = "Other")
wals_data$Pronominal.and.Adnominal.Demonstratives <- recode(wals_data$Pronominal.and.Adnominal.Demonstratives,
                      `Identical` = "Identical", .default = "Other")
wals_data$Third.Person.Pronouns.and.Demonstratives <- recode(wals_data$Third.Person.Pronouns.and.Demonstratives,
                        `Unrelated` = "Unrelated", .default = "Other")
wals_data$Gender.Distinctions.in.Independent.Personal.Pronouns <- recode(wals_data$Gender.Distinctions.in.Independent.Personal.Pronouns,
                        `3rd person singular only` = "3rd person singular only", .default = "Other")
wals_data$Politeness.Distinctions.in.Pronouns <- recode(wals_data$Politeness.Distinctions.in.Pronouns,
                        `No politeness distinction` = "No politeness distinction", .default = "Exists")
wals_data$Indefinite.Pronouns <- recode(wals_data$Indefinite.Pronouns,
                        `Generic-noun-based` = "Generic-noun-based", .default = "Other")
wals_data$Person.Marking.on.Adpositions <- recode(wals_data$Person.Marking.on.Adpositions,
                                   `No person marking` = "No person marking", .default = "Other")
wals_data$Number.of.Cases <- recode(wals_data$Number.of.Cases,
                      `No morphological case-marking` = "No morphological case-marking", .default = "Other")
wals_data$Voicing.in.Plosives.and.Fricatives <- recode(wals_data$Voicing.in.Plosives.and.Fricatives,
                      `In both plosives and fricatives` = "In both plosives and fricatives", .default = "Other")
wals_data$Asymmetrical.Case.Marking <- recode(wals_data$Asymmetrical.Case.Marking,
                        `Additive-quantitatively asymmetrical` = "Additive-quantitatively asymmetrical", .default = "Other")
wals_data$Position.of.Case.Affixes <- recode(wals_data$Position.of.Case.Affixes,
                         `No case affixes or adpositional clitics` = "No case affixes or adpositional clitics", .default = "Other")
wals_data$Comitatives.and.Instrumentals <- recode(wals_data$Comitatives.and.Instrumentals,
                        `Differentiation` = "Differentiation", .default = "Other")
wals_data$Ordinal.Numerals <- recode(wals_data$Ordinal.Numerals,
                        `First, second, three-th` = "First, second, three-th", .default = "Other")
wals_data$Distributive.Numerals <- recode(wals_data$Distributive.Numerals,
                                `No distributive numerals` = "No distributive numerals", .default = "Other")
wals_data$Numeral.Classifiers <- recode(wals_data$Numeral.Classifiers,
                                     `Absent` = "Absent", .default = "Exists")
wals_data$Conjunctions.and.Universal.Quantifiers <- recode(wals_data$Conjunctions.and.Universal.Quantifiers,
                                   `Formally different` = "Formally different", .default = "Other")
wals_data$Position.of.Pronominal.Possessive.Affixes <- recode(wals_data$Position.of.Pronominal.Possessive.Affixes,
                                 `No possessive affixes` = "No possessive affixes", .default = "Exists")
wals_data$Possessive.Classification <- recode(wals_data$Possessive.Classification,
                                           `No possessive classification` = "No possessive classification", .default = "Exists")
wals_data$Voicing.and.Gaps.in.Plosive.Systems <- recode(wals_data$Voicing.and.Gaps.in.Plosive.Systems,
                      `None missing in /p t k b d g/` = "None missing in /p t k b d g/", .default = "Other")
wals_data$Genitives..Adjectives.and.Relative.Clauses <- recode(wals_data$Genitives..Adjectives.and.Relative.Clauses,
                    `Highly differentiated` = "Highly differentiated", .default = "Other")
wals_data$Adjectives.without.Nouns <- recode(wals_data$Adjectives.without.Nouns,
                    `Without marking` = "Without marking", .default = "Other")
wals_data$Action.Nominal.Constructions <- recode(wals_data$Action.Nominal.Constructions,
                    `Ergative-Possessive` = "Ergative-Possessive", .default = "Other")
wals_data$Noun.Phrase.Conjunction <- recode(wals_data$Noun.Phrase.Conjunction,
                    `'And' identical to 'with'` = "'And' identical to 'with'", .default = "Other")
wals_data$Nominal.and.Verbal.Conjunction <- recode(wals_data$Nominal.and.Verbal.Conjunction,
                    `Differentiation` = "Differentiation", .default = "Other")
wals_data$The.Past.Tense <- recode(wals_data$The.Past.Tense,
                                `No past tense` = "No past tense", .default = "Exists")
wals_data$The.Perfect <- recode(wals_data$The.Perfect,
                              `No perfect` = "No perfect", .default = "Other")
wals_data$Position.of.Tense.Aspect.Affixes <- recode(wals_data$Position.of.Tense.Aspect.Affixes,
                           `Tense-aspect suffixes` = "Tense-aspect suffixes", .default = "Other")
wals_data$Uvular.Consonants <- recode(wals_data$Uvular.Consonants,
                          `None` = "None", .default = "Exists")
wals_data$The.Morphological.Imperative <- recode(wals_data$The.Morphological.Imperative,
                        `No second-person imperatives` = "No second-person imperatives", .default = "Exists")
wals_data$The.Prohibitive <- recode(wals_data$The.Prohibitive,
                      `Normal imperative + normal negative` = "Normal imperative + normal negative", .default = "Other")
wals_data$Imperative.Hortative.Systems <- recode(wals_data$Imperative.Hortative.Systems,
                      `Neither type of system` = "Neither type of system", .default = "Some System")
wals_data$Situational.Possibility <- recode(wals_data$Situational.Possibility,
                      `Verbal constructions` = "Verbal constructions", .default = "Other")
wals_data$Epistemic.Possibility <- recode(wals_data$Epistemic.Possibility,
                      `Verbal constructions` = "Verbal constructions", .default = "Other")
wals_data$Overlap.between.Situational.and.Epistemic.Modal.Marking <- recode(wals_data$Overlap.between.Situational.and.Epistemic.Modal.Marking,
                      `Overlap for both possibility and necessity` = "Overlap for both possibility and necessity", .default = "Other")
wals_data$Semantic.Distinctions.of.Evidentiality <- recode(wals_data$Semantic.Distinctions.of.Evidentiality,
                      `No grammatical evidentials` = "No grammatical evidentials", .default = "Exists")
wals_data$Coding.of.Evidentiality <- recode(wals_data$Coding.of.Evidentiality,
                      `Modal morpheme` = "Modal morpheme", .default = "Other")
wals_data$Suppletion.According.to.Tense.and.Aspect <- recode(wals_data$Suppletion.According.to.Tense.and.Aspect,
                                       `Tense and aspect` = "Tense and aspect", .default = "Other")
wals_data$Suppletion.in.Imperatives.and.Hortatives <- recode(wals_data$Suppletion.in.Imperatives.and.Hortatives,
                      `Imperative` = "Imperative", .default = "Other")
wals_data$Glottalized.Consonants <- recode(wals_data$Glottalized.Consonants,
                      `No glottalized consonants` = "No glottalized consonants", .default = "Exist")
wals_data$Verbal.Number.and.Suppletion <- recode(wals_data$Verbal.Number.and.Suppletion,
                                      `None` = "None", .default = "Exist")
wals_data$Order.of.Subject..Object.and.Verb <- recode(wals_data$Order.of.Subject..Object.and.Verb,
                                          `SVO` = "SVO", .default = "Other")
wals_data$Order.of.Subject.and.Verb <- recode(wals_data$Order.of.Subject.and.Verb,
                                          `SV` = "SV", .default = "Other")
wals_data$Order.of.Object.and.Verb <- recode(wals_data$Order.of.Object.and.Verb,
                                         `VO` = "VO", .default = "Other")
wals_data$Order.of.Object..Oblique..and.Verb <- recode(wals_data$Order.of.Object..Oblique..and.Verb,
                                        `VOX` = "VOX", .default = "Other")
wals_data$Order.of.Adposition.and.Noun.Phrase <- recode(wals_data$Order.of.Adposition.and.Noun.Phrase,
                                                  `Postpositions` = "Postpositions", .default = "Other")
wals_data$Order.of.Genitive.and.Noun <- recode(wals_data$Order.of.Genitive.and.Noun,
                                  `Noun-Genitive` = "Noun-Genitive", .default = "Other")
wals_data$Order.of.Adjective.and.Noun <- recode(wals_data$Order.of.Adjective.and.Noun,
                                  `Adjective-Noun` = "Adjective-Noun", .default = "Other")
wals_data$Order.of.Demonstrative.and.Noun <- recode(wals_data$Order.of.Demonstrative.and.Noun,
                                    `Demonstrative-Noun` = "Demonstrative-Noun", .default = "Other")
wals_data$Order.of.Numeral.and.Noun <- recode(wals_data$Order.of.Numeral.and.Noun,
                                               `Numeral-Noun` = "Numeral-Noun", .default = "Other")
wals_data$Lateral.Consonants <- recode(wals_data$Lateral.Consonants,
                                         `/l/, no obstruent laterals` = "/l/, no obstruent laterals", .default = "Other")
wals_data$Order.of.Relative.Clause.and.Noun <- recode(wals_data$Order.of.Relative.Clause.and.Noun,
                                  `Noun-Relative clause` = "Noun-Relative clause", .default = "Other")
wals_data$Order.of.Degree.Word.and.Adjective <- recode(wals_data$Order.of.Degree.Word.and.Adjective,
                                  `Degree word-Adjective` = "Degree word-Adjective", .default = "Other")
wals_data$Position.of.Polar.Question.Particles <- recode(wals_data$Position.of.Polar.Question.Particles,
                                `No question particle` = "No question particle", .default = "Exists")
wals_data$Position.of.Interrogative.Phrases.in.Content.Questions <- recode(wals_data$Position.of.Interrogative.Phrases.in.Content.Questions,
                              `Initial interrogative phrase` = "Initial interrogative phrase", .default = "Other")
wals_data$Order.of.Adverbial.Subordinator.and.Clause <- recode(wals_data$Order.of.Adverbial.Subordinator.and.Clause,
                              `Initial subordinator word` = "Initial subordinator word", .default = "Other")
wals_data$Relationship.between.the.Order.of.Object.and.Verb.and.the.Order.of.Adposition.and.Noun.Phrase <- recode(wals_data$Relationship.between.the.Order.of.Object.and.Verb.and.the.Order.of.Adposition.and.Noun.Phrase,
                              `VO and Prepositions` = "VO and Prepositions", .default = "Other")
wals_data$Relationship.between.the.Order.of.Object.and.Verb.and.the.Order.of.Relative.Clause.and.Noun <- recode(wals_data$Relationship.between.the.Order.of.Object.and.Verb.and.the.Order.of.Relative.Clause.and.Noun,
                         `VO and NRel` = "VO and NRel", .default = "Other")
wals_data$Relationship.between.the.Order.of.Object.and.Verb.and.the.Order.of.Adjective.and.Noun <- recode(wals_data$Relationship.between.the.Order.of.Object.and.Verb.and.the.Order.of.Adjective.and.Noun,
                        `VO and NAdj` = "VO and NAdj", .default = "Other")
wals_data$Alignment.of.Case.Marking.of.Full.Noun.Phrases <- recode(wals_data$Alignment.of.Case.Marking.of.Full.Noun.Phrases,
                        `Nominative - accusative (standard)` = "Nominative - accusative (standard)", .default = "Other")
wals_data$Alignment.of.Case.Marking.of.Pronouns <- recode(wals_data$Alignment.of.Case.Marking.of.Pronouns,
                        `Nominative - accusative (standard)` = "Nominative - accusative (standard)", .default = "Other")
wals_data$The.Velar.Nasal <- recode(wals_data$The.Velar.Nasal,
                      `No velar nasal` = "No velar nasal", .default = "Exists")




################ RECODE DATA FROM THE WORLD VALUES SURVEY ################

# read in WVS
setwd(paste0(path.name, "/data"))

na_strings <- c("Don´t know", "Missing; Unknown", "No answer", "Not applicable", "Not asked in survey")
wvs_data <- read.csv("forR.csv", na.strings=na_strings)

# first, recode the 25 values variables

# recode importance of things
#wvs_data <- wvs_data %>% 
#  mutate_at(c("Important.in.life..Family","Important.in.life..Friends","Important.in.life..Leisure.time",
#              "Important.in.life..Politics","Important.in.life..Work","Important.in.life..Religion"), 
#            funs(recode(., `Not at all important` = 1,
#                        `Not very important` = 2, 
#                        `Rather important` = 3, 
#                        `Very important` = 4)))

wvs_data$Important.in.life..Family <- recode(wvs_data$Important.in.life..Family,
                                             `Not at all important` = 1,
                                             `Not very important` = 2, 
                                             `Rather important` = 3, 
                                             `Very important` = 4)

wvs_data$Important.in.life..Friends <- recode(wvs_data$Important.in.life..Friends,
                                             `Not at all important` = 1,
                                             `Not very important` = 2, 
                                             `Rather important` = 3, 
                                             `Very important` = 4)

wvs_data$Important.in.life..Leisure.time <- recode(wvs_data$Important.in.life..Leisure.time,
                                              `Not at all important` = 1,
                                              `Not very important` = 2, 
                                              `Rather important` = 3, 
                                              `Very important` = 4)

wvs_data$Important.in.life..Politics <- recode(wvs_data$Important.in.life..Politics,
                                                   `Not at all important` = 1,
                                                   `Not very important` = 2, 
                                                   `Rather important` = 3, 
                                                   `Very important` = 4)

wvs_data$Important.in.life..Work <- recode(wvs_data$Important.in.life..Work,
                                               `Not at all important` = 1,
                                               `Not very important` = 2, 
                                               `Rather important` = 3, 
                                               `Very important` = 4)

wvs_data$Important.in.life..Religion <- recode(wvs_data$Important.in.life..Religion,
                                           `Not at all important` = 1,
                                           `Not very important` = 2, 
                                           `Rather important` = 3, 
                                           `Very important` = 4)

wvs_data$Interest.in.politics <- recode(wvs_data$Interest.in.politics,
                                   `Not at all interested` = 1,
                                   `Not very interested` = 2, 
                                   `Somewhat interested` = 3, 
                                   `Very interested` = 4)
wvs_data$Most.people.can.be.trusted <- recode(wvs_data$Most.people.can.be.trusted,
                                   `Can´t be too careful` = 0,
                                   `Most people can be trusted` = 1)
wvs_data$How.much.freedom.of.choice.and.control <- recode(wvs_data$How.much.freedom.of.choice.and.control,
                                   `None at all` = 1, `2`=2, `3`=3, `4`=4, `5`=5, `6`=6, `7`=7,
                                   `8`=8, `9`=9, `A great deal` = 10)
wvs_data$Jobs.scarce..Employers.should.give.priority.to..nation..people.than.immigrants <- 
  recode(wvs_data$Jobs.scarce..Employers.should.give.priority.to..nation..people.than.immigrants,
                                                      `Disagree` = 0, 
                                                      `Neither` = 1, 
                                                      `Agree` = 2)
wvs_data$Jobs.scarce..Men.should.have.more.right.to.a.job.than.women <- 
  recode(wvs_data$Jobs.scarce..Men.should.have.more.right.to.a.job.than.women,
         `Disagree` = 0, 
         `Neither` = 1, 
         `Agree` = 2)
wvs_data$Woman.as.a.single.parent <- recode(wvs_data$Woman.as.a.single.parent,
         `Disapprove` = 0, 
         `Depends` = 1, 
         `Approve` = 2)
wvs_data$Men.make.better.political.leaders.than.women.do <- recode(wvs_data$Men.make.better.political.leaders.than.women.do,
                                       `Strongly disagree` = 0, 
                                       `Disagree` = 1, 
                                       `Agree` = 2,
                                       `Agree strongly` = 3)
wvs_data$Aims.of.country..first.choice <- recode(wvs_data$Aims.of.country..first.choice,
                                           `A high level of economic growth` = 1, .default = 0)
wvs_data$Most.important..first.choice <- recode(wvs_data$Most.important..first.choice,
                               `A stable economy` = 1, .default = 0)
wvs_data$Willingness.to.fight.for.country <- recode(wvs_data$Willingness.to.fight.for.country,
                                           `Yes` = 1, .default = 0)
wvs_data$Self.positioning.in.political.scale <- recode(wvs_data$Self.positioning.in.political.scale,
                                                     `Left` = 1, `2`=2, `3`=3, `4`=4, `5`=5, `6`=6, `7`=7,
                                                     `8`=8, `9`=9, `Right` = 10)
wvs_data$Income.equality <- recode(wvs_data$Income.equality,
                                                  `Incomes should be made more equal` = 1, 
                                                  `2`=2, `3`=3, `4`=4, `5`=5, `6`=6, `7`=7, `8`=8, `9`=9, 
                                                  `We need larger income differences as incentives` = 10)
wvs_data$Private.vs.state.ownership.of.business <- recode(wvs_data$Private.vs.state.ownership.of.business,
                              `Private ownership of business should be increased` = 1, 
                              `2`=2, `3`=3, `4`=4, `5`=5, `6`=6, `7`=7, `8`=8, `9`=9, 
                              `Government ownership of business should be increased` = 10)
wvs_data$Government.responsibility <- recode(wvs_data$Government.responsibility,
                                                     `People should take more responsibility` = 1, 
                                                     `2`=2, `3`=3, `4`=4, `5`=5, `6`=6, `7`=7, `8`=8, `9`=9, 
                                                     `The government should take more responsibility` = 10)
wvs_data$Political.system..Having.a.strong.leader <- recode(wvs_data$Political.system..Having.a.strong.leader,
                                                              `Very bad` = 0, 
                                                              `Bad` = 1, 
                                                              `Fairly good` = 2,
                                                              `Very good` = 3)
wvs_data$Political.system..Having.a.democratic.political.system <- recode(wvs_data$Political.system..Having.a.democratic.political.system,
                                                       `Very bad` = 0, 
                                                       `Fairly bad` = 1, 
                                                       `Fairly good` = 2,
                                                       `Very good` = 3)
wvs_data$How.proud.of.nationality <- recode(wvs_data$How.proud.of.nationality,
                                                         `Not at all proud` = 0, 
                                                         `Not very proud` = 1,
                                                         `Quite proud` = 2,
                                                         `Very proud` = 3)
wvs_data$Religious.person <- recode(wvs_data$Religious.person,
                                       `A convinced atheist` = 0, 
                                       `Not a religious person` = 1,
                                       `A religious person` = 2)
wvs_data$Feeling.of.happiness <- recode(wvs_data$Feeling.of.happiness,
                                       `Not at all happy` = 0, 
                                       `Not very happy` = 1,
                                       `Quite happy` = 2,
                                       `Very happy` = 3)



# now recode the four behavioral outcomes
wvs_data$How.often.do.you.attend.religious.services <- recode(wvs_data$How.often.do.you.attend.religious.services,
                                                         `Never practically never` = 0, 
                                                         `Less often` = 1, 
                                                         `Once a year` = 2,
                                                         `Only on special holy days/Christmas/Easter days` = 3,
                                                         `Other specific holy days` = 3,
                                                         `Once a month` = 4,
                                                         `Once a week` = 5,
                                                         `More than once a week` = 6)
wvs_data$Family.savings.during.past.year <- recode(wvs_data$Family.savings.during.past.year,
                                                         `Just get by` = 0, 
                                                         `Spent savings and borrowed money` = 1, 
                                                         `Spent some savings and borrowed money` = 1,
                                                         `Save money` = 2)
wvs_data$How.many.children.do.you.have <- recode(wvs_data$How.many.children.do.you.have,
                                                         `No child` = 0, 
                                                         `1 child` = 1, 
                                                         `2 children` = 2,
                                                         `3 children` = 3,
                                                         `4 children` = 4,
                                                         `5 children` = 5,
                                                         `6 children` = 6,
                                                         `7 children` = 7,
                                                          `8` = 8)
wvs_data$Political.action..signing.a.petition <- recode(wvs_data$Political.action..signing.a.petition,
                                              `Would never do` = 0, 
                                              `Might do` = 1,
                                              `Have done` = 2)




################ MERGE WVS AND WALS AND PREP FOR ANALYSIS ################

# drop observations where interview language is "NA"
wvs_interview <- wvs_data[complete.cases(wvs_data[ , 8]),]
wvs_home <- wvs_data[complete.cases(wvs_data[ , 43]),]

# merge data
xwalk <- read.csv("language crosswalk.csv", na.strings=c("","NA"))[,1:3]
names(xwalk) <- c("Language.in.which.interview.was.conducted","Language.at.home","Language")

match1 <- merge(wvs_interview,xwalk,by="Language.in.which.interview.was.conducted", all.x=TRUE)
match2 <- merge(wvs_home,xwalk,by="Language.at.home", all.x=TRUE)

matched_wals_wvs_interview <- merge(match1, wals_data, all.x=TRUE, by="Language")
matched_wals_wvs_home <- merge(match2, wals_data, all.x=TRUE, by="Language")


# select only variables that vary across respondents in the sample
wals_col_names <- names(wals_data[3:ncol(wals_data)])
dimensions <- lapply(wals_col_names, function(x) {  
  v1 <- table(matched_wals_wvs_interview[,x])[1]
  v2 <- table(matched_wals_wvs_interview[,x])[2]
  v3 <- length(table(matched_wals_wvs_interview[,x]))
  return(c(v1,v2,v3))
}
)
to_bind <- t(matrix(unlist(dimensions),nrow=3))
indexed <- cbind(wals_col_names,to_bind)
toselect <- indexed[indexed[,4]==2 & indexed[,2]!=0 & indexed[,3]!=0, ]
xs <- split(toselect[,1], ceiling(seq_along(toselect[,1])/5))


# a vector of 25 possible values DVs
possible_dvs <- c("Important.in.life..Family","Important.in.life..Friends","Important.in.life..Leisure.time",
                  "Important.in.life..Politics","Important.in.life..Work","Important.in.life..Religion","Interest.in.politics",
                  "Most.people.can.be.trusted","How.much.freedom.of.choice.and.control",
                  "Jobs.scarce..Employers.should.give.priority.to..nation..people.than.immigrants",
                  "Jobs.scarce..Men.should.have.more.right.to.a.job.than.women", "Woman.as.a.single.parent", 
                  "Men.make.better.political.leaders.than.women.do", 
                  "Aims.of.country..first.choice", "Most.important..first.choice",
                  "Willingness.to.fight.for.country","Self.positioning.in.political.scale","Income.equality",
                  "Private.vs.state.ownership.of.business","Government.responsibility",
                  "Political.system..Having.a.strong.leader","Political.system..Having.a.democratic.political.system",
                  "How.proud.of.nationality","Religious.person","Feeling.of.happiness")

# a vector of 4 possible behavioral DVs
possible_dvs_behave <- c("How.often.do.you.attend.religious.services","Family.savings.during.past.year",
                         "How.many.children.do.you.have","Political.action..signing.a.petition")


# this is a test
#matched_wals_wvs_interview$Tea.Num <- recode(matched_wals_wvs_interview$Tea,
#                                   `Other` = 0, .default = 1)
#matched_wals_wvs_interview$Politeness.Distinctions.in.Pronouns.Num <- recode(matched_wals_wvs_interview$Politeness.Distinctions.in.Pronouns,
#                                   `No politeness distinction` = 0, .default = 1)
#
#fit <- tidy(lm(scale(Self.positioning.in.political.scale) ~ scale(Politeness.Distinctions.in.Pronouns.Num) + factor(Employment.status) +
#            Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Scale.of.incomes) +
#            factor(Highest.educational.level.attained) + factor(Social.class..subjective.) +
#            factor(Country.region) + factor(Year.survey), data=matched_wals_wvs_interview))
#
#fit_interview <- tidy(lm(Self.positioning.in.political.scale ~ Politeness.Distinctions.in.Pronouns + factor(Employment.status) +
#            Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Scale.of.incomes) +
#            factor(Highest.educational.level.attained) + factor(Social.class..subjective.) +
#            factor(Country.region) + factor(Year.survey), data=matched_wals_wvs_interview))
#fit_home <- tidy(lm(Self.positioning.in.political.scale ~ Politeness.Distinctions.in.Pronouns + factor(Employment.status) +
#                      Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Scale.of.incomes) +
#                      factor(Highest.educational.level.attained) + factor(Social.class..subjective.) +
#                      factor(Country.region) + factor(Year.survey), data=matched_wals_wvs_home))
#fit <- tidy(lm(Political.system..Having.a.strong.leader ~ Indefinite.Pronouns + factor(Employment.status) +
#                      Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Scale.of.incomes) +
#                      factor(Highest.educational.level.attained) + factor(Social.class..subjective.) +
#                      factor(Country.region) + factor(Year.survey), data=matched_wals_wvs_home))


############### TABLES FOR BODY #############

setwd(paste0(path.name, "/output"))

# WVS DVS 
wvs_to_summarize <- read.csv(file=paste0(path.name, "/data/forR.csv"), na.strings=na_strings)

# Table 1, values
summary_wvs <- lapply(possible_dvs, function(x) {
  c(x, table(wvs_to_summarize[x]))
}
)
summary_wvs <- lapply(summary_wvs, `length<-`, max(lengths(summary_wvs)))
wvs_table <- data.frame(t(matrix(unlist(summary_wvs),nrow=max(lengths(summary_wvs)))))
summary_wvs_names <- lapply(possible_dvs, function(x) {
  c(x, names(table(wvs_to_summarize[x])))
}
)
summary_wvs_names <- lapply(summary_wvs_names, `length<-`, max(lengths(summary_wvs_names)))
wvs_table_names <- data.frame(t(matrix(unlist(summary_wvs_names),nrow=max(lengths(summary_wvs_names)))))
wvs_table_names$X1 <- gsub("[.]", " ", wvs_table_names$X1)
wvs_table <- rbind(wvs_table, wvs_table_names)
wvs_table <- arrange(wvs_table, X1, desc(X2))
wvs_table$X1 <- gsub("[.]", " ", wvs_table$X1)

# write.table(wvs_table, file="table1.csv", row.names=FALSE, sep=",")
xtable(data.frame(wvs_table_names$X1), type="latex")
print(xtable(data.frame(wvs_table_names$X1)), type="html",
      file="table1.html")


# Table 2, behaviors

summary_wvs <- lapply(possible_dvs_behave, function(x) {
  c(x, table(wvs_to_summarize[x]))
}
)
summary_wvs <- lapply(summary_wvs, `length<-`, max(lengths(summary_wvs)))
wvs_table <- data.frame(t(matrix(unlist(summary_wvs),nrow=max(lengths(summary_wvs)))))
summary_wvs_names <- lapply(possible_dvs_behave, function(x) {
  c(x, names(table(wvs_to_summarize[x])))
}
)
summary_wvs_names <- lapply(summary_wvs_names, `length<-`, max(lengths(summary_wvs_names)))
wvs_table_names <- data.frame(t(matrix(unlist(summary_wvs_names),nrow=max(lengths(summary_wvs_names)))))
wvs_table_names$X1 <- gsub("[.]", " ", wvs_table_names$X1)
wvs_table <- rbind(wvs_table, wvs_table_names)
wvs_table <- arrange(wvs_table, X1, desc(X2))
wvs_table$X1 <- gsub("[.]", " ", wvs_table$X1)

# write.table(wvs_table, file="table2.csv", row.names=FALSE, sep=",")
xtable(data.frame(wvs_table_names$X1), type="latex")
print(xtable(data.frame(wvs_table_names$X1)), type="html",
      file="table2.html")


############### TABLES FOR APPENDIX #############

# Table A1: WALS FEATURES
wals_vars <- unlist(xs)
data_to_summarize <- wals_data[wals_vars]

summary_wals <- lapply(seq(ncol(data_to_summarize)), function(x) {
  c(names(data_to_summarize[x]), names(table(data_to_summarize[x])), 
    data.frame(table(data_to_summarize[x]))$Freq)
}
)
wals_table <- data.frame(t(matrix(unlist(summary_wals),nrow=5)))
wals_table$X1 <- gsub("[.]", " ", wals_table$X1)
names(wals_table) <- c("Variable","Value 1","Value 2","N Languages (Value 1)", "N Languages (Value 2)")
# write.table(wals_table, file="tableA1.csv", row.names=FALSE)
xtable(wals_table, type="latex")
print(xtable(wals_table), type="html",
      file="tableA1.html")


# Table A2: MATCHED LANGUAGES

lang_table <- data.frame(table(matched_wals_wvs_home$Language.at.home[matched_wals_wvs_home$Language!=""] ))
names(lang_table) <- c("Language.at.home", "Frequency")
lang_table_merged <- merge(lang_table,xwalk,by="Language.at.home", all.x=TRUE)
lang_table_merged <- lang_table_merged[ , !(names(lang_table_merged) %in% c("Language.in.which.interview.was.conducted"))]
lang_table_merged <- lang_table_merged[lang_table_merged$Frequency!=0,]
names(lang_table_merged) <- c("Language (WVS Name)", "Frequency", "Language (WALS Name)")
row.names(lang_table_merged) <- 1:nrow(lang_table_merged)
#write.table(lang_table_merged, file="tableA2.csv", row.names=FALSE, sep=",")
xtable(lang_table_merged, type="latex")
print(xtable(lang_table_merged), type="html",
      file="tableA2.html")

lang_table_all <- data.frame(table(matched_wals_wvs_home$Language.at.home))
100 - 100*sum(lang_table$Frequency)/sum(lang_table_all$Freq)
nrow(lang_table_all)
nrow(lang_table_all) - nrow(lang_table_merged)




################ RUN ANALYSES ################

#  run the values analysis

to_run_home <- lapply(xs, function(ivs) {
  possible_models <- expand.grid(y = possible_dvs, x = ivs, 
                                                               stringsAsFactors = FALSE)
                possible_models
                possibly_lm = possibly(~ lm(.x, data = matched_wals_wvs_home), otherwise = NA)
                   models_actual <- possible_models %>% 
                   # Build a formula for each combination
                   mutate(formula = map2(x, y, ~ as.formula(paste(.y, "~ as.factor(", .x,") + factor(Employment.status) +
                                                                  Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Scale.of.incomes) +
                                                                  factor(Highest.educational.level.attained) + factor(Social.class..subjective.) +
                                                                  factor(Country.region) + factor(Year.survey)")))) %>% 
                   # Run a model for each formula
                   mutate(model = map(formula, possibly_lm)) %>% 
                   # Extract model elements
                   mutate(model_tidy = map(model, tidy))
                 res <- lapply(seq(nrow(possible_models)), function(x) c(models_actual$model_tidy[[x]]$estimate[2],
                                                                         models_actual$model_tidy[[x]]$statistic[2]))
                 res <- data.frame(t(matrix(unlist(res), ncol = nrow(possible_models), byrow = FALSE)))
                 
                 # combine them with y and x list
                 output <- cbind(possible_models, res) 
                 names(output) <- c("y","x","b","t")
                 output$t <- abs(output$t)
                 output
                 return(output)
                 rm(models_actual)
                 
  }               
)

# unlist and combine results into a single dataframe
results_home<-list.stack(to_run_home)
results_home
rm(to_run_home)

# here's how significant things are
percentile <- ecdf(results_home$t)
1-percentile(1.96)
1-percentile(5)

# save results
write.table(results_home, file=paste0(path.name, "/output/results_home.csv"), sep = ",", row.names=FALSE)
results_home <- read.csv(file=paste0(path.name, "/output/results_home.csv"), sep=",")
  
# relabel results
results_home$y <- recode(results_home$y, `Important.in.life..Family` = "Imp.: Family",
        `Important.in.life..Friends` = "Imp: Friends",
        `Important.in.life..Leisure.time` = "Imp: Leisure",
        `Important.in.life..Politics` = "Imp: Politics",
        `Important.in.life..Work` = "Imp: Work",
        `Important.in.life..Religion` = "Imp: Religion",
        `Interest.in.politics` = "Political Interest",
        `Most.people.can.be.trusted` = "Trust People",
        `How.much.freedom.of.choice.and.control` = "Free. v. Control",
        `Jobs.scarce..Employers.should.give.priority.to..nation..people.than.immigrants` = "Nation First",
        `Jobs.scarce..Men.should.have.more.right.to.a.job.than.women` = "Men First",
        `Woman.as.a.single.parent` = "Single Parent",
        `Men.make.better.political.leaders.than.women.do` = "Male Leaders",
        `Aims.of.country..first.choice` = "Strong Economy",
        `Most.important..first.choice` = "Stable Economy",
        `Willingness.to.fight.for.country` = "Willing to Fight",
        `Self.positioning.in.political.scale` = "Left-Right",
        `Income.equality` = "Inc. Inequality",
        `Private.vs.state.ownership.of.business` = "Bus. v. State",
        `Government.responsibility` = "Govt. Resp.",
        `Political.system..Having.a.strong.leader` = "Strong Leader",
        `Political.system..Having.a.democratic.political.system` = "Democracy",
        `How.proud.of.nationality` = "National Pride",
        `Religious.person` = "Religiosity",
        `Feeling.of.happiness` = "Happiness")

# calculate the median t-stat per dv
results_home <- results_home %>%
  group_by(y) %>%
  mutate(med_t = median(t))


# calculate the proportion of results that are significant at p < .01, and order
# dvs by that proportion
results_home <- results_home %>%
  group_by(y) %>%
  mutate(pct01 = length(t[t>2.576])/length(t))
results_home <- data.frame(results_home)
to.order <- aggregate(results_home, by=list(results_home$y),
                      FUN=mean, na.rm=TRUE)
to.order$Group.1 <- reorder(to.order$Group.1, -to.order$pct01)
results_home$orderedys <- results_home$y
results_home$orderedys <- factor(results_home$orderedys,      
                                 levels = names(table(to.order$Group.1)))  # Reordering group factor levels


# get effect sizes relative to SD of each DV
for.sd <- cbind(matched_wals_wvs_home[17:22],matched_wals_wvs_home[24:34],
                matched_wals_wvs_home[36:41],matched_wals_wvs_home[43:44])
melt.for.sd <- na.omit(melt(for.sd))
melt.for.sd$value <- as.numeric(melt.for.sd$value)

sds <- data.frame(
  melt.for.sd %>%
  dplyr::group_by(variable) %>%
  summarize(sd(value))
)


names(sds) <- c("y","sd")
sds$y <- recode(sds$y, `Important.in.life..Family` = "Imp.: Family",
                         `Important.in.life..Friends` = "Imp: Friends",
                         `Important.in.life..Leisure.time` = "Imp: Leisure",
                         `Important.in.life..Politics` = "Imp: Politics",
                         `Important.in.life..Work` = "Imp: Work",
                         `Important.in.life..Religion` = "Imp: Religion",
                         `Interest.in.politics` = "Political Interest",
                         `Most.people.can.be.trusted` = "Trust People",
                         `How.much.freedom.of.choice.and.control` = "Free. v. Control",
                         `Jobs.scarce..Employers.should.give.priority.to..nation..people.than.immigrants` = "Nation First",
                         `Jobs.scarce..Men.should.have.more.right.to.a.job.than.women` = "Men First",
                         `Woman.as.a.single.parent` = "Single Parent",
                         `Men.make.better.political.leaders.than.women.do` = "Male Leaders",
                         `Aims.of.country..first.choice` = "Strong Economy",
                         `Most.important..first.choice` = "Stable Economy",
                         `Willingness.to.fight.for.country` = "Willing to Fight",
                         `Self.positioning.in.political.scale` = "Left-Right",
                         `Income.equality` = "Inc. Inequality",
                         `Private.vs.state.ownership.of.business` = "Bus. v. State",
                         `Government.responsibility` = "Govt. Resp.",
                         `Political.system..Having.a.strong.leader` = "Strong Leader",
                         `Political.system..Having.a.democratic.political.system` = "Democracy",
                         `How.proud.of.nationality` = "National Pride",
                         `Religious.person` = "Religiosity",
                         `Feeling.of.happiness` = "Happiness")

results_home <- merge(results_home, sds, by="y")

results_home$size <- abs(results_home$b / results_home$sd)

# effect sizes
summary(results_home$size[results_home$t>2.576])

nrow(results_home[results_home$size < .2 & results_home$t>2.576,]) / 
  nrow(results_home[results_home$t>2.576,])


#  run the values analysis again, but using the language spoken at interview to match

to_run_interview <- lapply(xs, function(ivs) {
  possible_models <- expand.grid(y = possible_dvs, x = ivs, 
                                 stringsAsFactors = FALSE)
  possible_models
  possibly_lm = possibly(~ lm(.x, data = matched_wals_wvs_interview), otherwise = NA)
  models_actual <- possible_models %>% 
    # Build a formula for each combination
    mutate(formula = map2(x, y, ~ as.formula(paste(.y, "~ as.factor(", .x,") + factor(Employment.status) +
                                                   Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Scale.of.incomes) +
                                                   factor(Highest.educational.level.attained) + factor(Social.class..subjective.) +
                                                   factor(Country.region) + factor(Year.survey)")))) %>% 
    # Run a model for each formula
    mutate(model = map(formula, possibly_lm)) %>% 
    # Extract model elements
    mutate(model_tidy = map(model, tidy))
  res <- lapply(seq(nrow(possible_models)), function(x) c(models_actual$model_tidy[[x]]$estimate[2],
                                                          models_actual$model_tidy[[x]]$statistic[2]))
  res <- data.frame(t(matrix(unlist(res), ncol = nrow(possible_models), byrow = FALSE)))
  
  # combine them with y and x list
  output <- cbind(possible_models, res) 
  names(output) <- c("y","x","b","t")
  output$t <- abs(output$t)
  output
  return(output)
  rm(models_actual)
  
}               
    )

# unlist and combine results into a single dataframe
results_interview <- list.stack(to_run_interview)
results_interview
rm(to_run_interview)

# save results
write.table(results_interview, file=paste0(path.name, "/output/results_interview.csv"), sep = ",", row.names=FALSE)

# compare
home <- read.csv(file=paste0(path.name, "/output/results_home.csv"))
interview <- read.csv(file=paste0(path.name, "/output/results_interview.csv"))
compare <- merge(home, interview, by=c("y","x"), suffixes = c(".home",".interview"))
mean(compare$t.interview)
mean(compare$t.home)



##### run the behavioral analysis

to_run_behave <- lapply(xs, function(ivs) {
  possible_models <- expand.grid(y = possible_dvs_behave, x = ivs, 
                                 stringsAsFactors = FALSE)
  possible_models
  possibly_lm = possibly(~ lm(.x, data = matched_wals_wvs_home), otherwise = NA)
  models_actual <- possible_models %>% 
    # Build a formula for each combination
    mutate(formula = map2(x, y, ~ as.formula(paste(.y, "~ as.factor(", .x,") + factor(Employment.status) +
                                                   Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Scale.of.incomes) +
                                                   factor(Highest.educational.level.attained) + factor(Social.class..subjective.) +
                                                   factor(Country.region) + factor(Year.survey)")))) %>% 
    # Run a model for each formula
    mutate(model = map(formula, possibly_lm)) %>% 
    # Extract model elements
    mutate(model_tidy = map(model, tidy))
  res <- lapply(seq(nrow(possible_models)), function(x) c(models_actual$model_tidy[[x]]$estimate[2],
                                                          models_actual$model_tidy[[x]]$statistic[2]))
  res <- data.frame(t(matrix(unlist(res), ncol = nrow(possible_models), byrow = FALSE)))
  
  # combine them with y and x list
  output <- cbind(possible_models, res) 
  names(output) <- c("y","x","b","t")
  output$t <- abs(output$t)
  output
  return(output)
  rm(models_actual)
  
}               
    )

# unlist and combine results into a single dataframe
results_behave <- list.stack(to_run_behave)
results_behave
rm(to_run_behave)

# here's how significant things are
percentile <- ecdf(results_behave$t)
1-percentile(1.96)
1-percentile(5)

# save results
write.table(results_behave, file=paste0(path.name, "/output/results_behave.csv"), sep = ",", row.names=FALSE)
results_behave <- read.csv(file=paste0(path.name, "/output/results_behave.csv"), sep=",")

# relabel results
results_behave$y <- recode(results_behave$y, `How.often.do.you.attend.religious.services` = "Religious Attendance",
                    `Family.savings.during.past.year` = "Family Savings",
                    `How.many.children.do.you.have` = "Number of Children",
                    `Political.action..signing.a.petition` = "Ever Petition")

# calculate the median t-stat per dv
results_behave <- results_behave %>%
  group_by(y) %>%
  mutate(med_t = median(t))


# calculate the proportion of results that are significant at p < .01, and order
# dvs by that proportion
results_behave <- results_behave %>%
  group_by(y) %>%
  mutate(pct01 = length(t[t>2.576])/length(t))
results_behave <- data.frame(results_behave)
to.order <- aggregate(results_behave, by=list(results_behave$y),
                      FUN=mean, na.rm=TRUE)
to.order$Group.1 <- reorder(to.order$Group.1, -to.order$pct01)
results_behave$orderedys <- results_behave$y
results_behave$orderedys <- factor(results_behave$orderedys,      
                                 levels = names(table(to.order$Group.1)))  # Reordering group factor levels





################ NOW CREATE PLOTS ################


setwd(paste0(path.name, "/output"))

# all values results
p <- ggplot(data = results_home, mapping = aes(x=t))
p + geom_density(color="gray", fill="gray") +
  geom_vline(aes(xintercept=median(t))) + geom_vline(aes(xintercept=2.576), color="black", linetype="dashed") +
  theme_classic() +   xlab("|t-statistic| of linguistic feature variable") + ylab("Density") +
  annotate("text", x=c(10,10), y=c(.13,.15), label=c("median |t-statistic|", "t = 2.576"), color=c("black","black")) +
  annotate("segment", x=5, xend=3.5, y=.13, yend=.13, arrow=arrow(length=unit(.05, "inches"))) + 
  annotate("segment", x=5, xend=2.8, y=.15, yend=.15, color="black", arrow=arrow(length=unit(.05, "inches")))
p_out <- p + geom_density(color="gray", fill="gray") +
  geom_vline(aes(xintercept=median(t))) + geom_vline(aes(xintercept=2.576), color="black", linetype="dashed") +
  theme_classic() +   xlab("|t-statistic| of linguistic feature variable") + ylab("Density") +
  annotate("text", x=c(10,10), y=c(.13,.15), label=c("median |t-statistic|", "t = 2.576"), color=c("black","black")) +
  annotate("segment", x=6, xend=3.5, y=.13, yend=.13, arrow=arrow(length=unit(.05, "inches"))) + 
  annotate("segment", x=6, xend=2.8, y=.15, yend=.15, color="black", arrow=arrow(length=unit(.05, "inches")))
ggsave("figure1.pdf", plot=p_out, height=3, width=6, units="in")

# values results by DVs
p + geom_density(color="gray", fill="gray") + ylab("Density") +
  theme_classic() + facet_wrap(~orderedys, ncol = 5) + 
  geom_vline(aes(xintercept=med_t)) + geom_vline(aes(xintercept=2.576), color="black", linetype="dashed")  + 
  xlab("|t-statistic| of linguistic feature variable") + 
  theme(strip.background = element_rect(colour="white", fill="white"))
p_out <- p + geom_density(color="gray", fill="gray") + ylab("Density") +
  theme_classic() + facet_wrap(~orderedys, ncol = 5) + 
  geom_vline(aes(xintercept=med_t)) + geom_vline(aes(xintercept=2.576), color="black", linetype="dashed")  + 
  xlab("|t-statistic| of linguistic feature variable") + 
  theme(strip.background = element_rect(colour="white", fill="white"))
ggsave("figure2.pdf", plot=p_out, height=4, width=6, units="in")

# behavioral results by DVs
p <- ggplot(data = results_behave, mapping = aes(x=t))
p + geom_density(color="gray", fill="gray") + ylab("Density") +
  theme_classic() + facet_wrap(~orderedys, ncol = 2) + 
  geom_vline(aes(xintercept=med_t)) + geom_vline(aes(xintercept=2.576), color="black", linetype="dashed")  + 
  xlab("|t-statistic| of linguistic feature variable") + 
  theme(strip.background = element_rect(colour="white", fill="white"))
p_out <- p + geom_density(color="gray", fill="gray") + ylab("Density") +
  theme_classic() + facet_wrap(~orderedys, ncol = 2) + 
  geom_vline(aes(xintercept=med_t)) + geom_vline(aes(xintercept=2.576), color="black", linetype="dashed")  + 
  xlab("|t-statistic| of linguistic feature variable") + 
  theme(strip.background = element_rect(colour="white", fill="white"))
ggsave("figure3.pdf", plot=p_out, height=2.5, width=6, units="in")


# APPENDIX FIGURE A1
# comparison of language at home and language of interview
p <- ggplot(data = compare, mapping = aes(x=t.home,y=t.interview))
p + geom_point(shape=1, color="black", fill="white") + theme_classic() + 
  geom_hline(aes(yintercept=2.576), color="black", linetype="dashed") + geom_vline(aes(xintercept=2.576), color="black", linetype="dashed") +
  ylab("|t-statistic| using Language of Interview") + xlab("|t-statistic| using Language at Home") +
  scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x), 
                labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
                labels = scales::trans_format("log10", scales::math_format(10^.x)))
p_out <- p + geom_point(shape=1, color="black", fill="white") + theme_classic() + 
  geom_hline(aes(yintercept=2.576), color="black", linetype="dashed") + geom_vline(aes(xintercept=2.576), color="black", linetype="dashed") +
  ylab("|t-statistic| using Language of Interview (Log scale)") + xlab("|t-statistic| using Language at Home (Log scale)") +
  scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x), 
                labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
                labels = scales::trans_format("log10", scales::math_format(10^.x)))
ggsave("figureA1.pdf", plot=p_out, height=4, width=6, units="in")



################ AND HERE IS HOW WE FIX IT ################


setwd(paste0(path.name, "/output"))

results_to_fix <- rbind(read.csv("results_home.csv"),read.csv("results_behave.csv"))

most_sig <- head(arrange(results_to_fix, -t), 50)
to_fix <- most_sig[,1:2]


# twoway clustering

models_fixed <- to_fix %>% 
  # Build a formula for each combination
  mutate(formula = map2(x, y, ~ as.formula(paste(.y, "~ as.factor(", .x,") + factor(Employment.status) +
                                                 Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Scale.of.incomes) +
                                                 factor(Highest.educational.level.attained) + factor(Social.class..subjective.) +
                                                 factor(Country.region) + factor(Year.survey)" )))) %>% 
  # Run a model for each formula
  mutate(model = map(formula, ~ lm(.x, data = matched_wals_wvs_home))) %>% 
  # extract s.e.
  mutate(vmat = map(model, vcovCL, cluster = ~ Country.region + Year.survey, multi0 = TRUE)) 


totidy <- lapply(seq(nrow(most_sig)), function(x) coeftest(models_fixed$model[[x]], models_fixed$vmat[[x]]))
tidied <- map(totidy, tidy)
tidied_res <- lapply(seq(nrow(most_sig)), function(x) c(tidied[[x]]$estimate[2],
                                                        tidied[[x]]$statistic[2]))
tidied_res <- data.frame(t(matrix(unlist(tidied_res), ncol = nrow(to_fix), byrow = FALSE)))
tidied_res <- cbind(to_fix, tidied_res) 
names(tidied_res) <- c("y","x","b","t")
tidied_res$t <- abs(tidied_res$t)

tidied_res$Method <- "Clustered"
tidied_res$Model <- seq(1:nrow(tidied_res))
most_sig$Method <- "Standard"
most_sig$Model <- seq(1:nrow(tidied_res))
combined <- rbind(most_sig, tidied_res)

# not run: to compare cluster results to unadjusted results
#fixplot <- ggplot(data=combined, aes(x = Model, y=t, fill=Method))
#fixplot + coord_flip() + geom_bar(stat="identity", position=position_dodge()) + 
#    scale_fill_manual(values=c("darkblue","lightblue")) + 
#    ylab("|t-statistic|") + geom_hline(aes(yintercept=2.576), color="red") + theme_classic() +
#    annotate("text", x=18, y=20, label="t = 2.576", color="red") +
#    annotate("segment", x=17.9, xend=17.9, y=15.5, yend=3, color="red", arrow=arrow(length=unit(.05, "inches"))) +
#  theme(legend.position = c(0.85, 0.8), axis.text.y=element_blank())
#fixplot_out <- fixplot + coord_flip() + geom_bar(stat="identity", position=position_dodge()) + 
#  scale_fill_manual(values=c("darkblue","lightblue")) + 
#  ylab("|t-statistic|") + geom_hline(aes(yintercept=2.576), color="red") + theme_classic() +
#  annotate("text", x=18, y=20, label="t = 2.576", color="red") +
#  annotate("segment", x=17.9, xend=17.9, y=15.5, yend=3, color="red", arrow=arrow(length=unit(.05, "inches"))) +
#  theme(legend.position = c(0.85, 0.8), axis.text.y=element_blank())
#ggsave("cluster_results.pdf", plot=fixplot_out, height=4, width=4, units="in")


# multilevel modeling

models_fixed_multi <- to_fix %>% 
  # Build a formula for each combination
  mutate(formula = map2(x, y, ~ as.formula(paste(.y, "~ as.factor(", .x,") + factor(Employment.status) +
                                                 Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Scale.of.incomes) +
                                                 factor(Highest.educational.level.attained) + factor(Social.class..subjective.) +
                                                 factor(Country.region) + factor(Year.survey) + 
                                                (1 | Language.at.home)")))) %>% 
  # Run a model for each formula
  mutate(model = map(formula, ~ lmer(.x, data = matched_wals_wvs_home))) %>% 
  # Extract model elements
  mutate(model_tidy = map(model, tidy))


multi_res <- lapply(seq(nrow(most_sig)), function(x) c(models_fixed_multi$model_tidy[[x]]$estimate[2],
                                                    models_fixed_multi$model_tidy[[x]]$statistic[2]))
multi_res <- data.frame(t(matrix(unlist(multi_res), ncol = nrow(to_fix), byrow = FALSE)))
multi_res <- cbind(to_fix, multi_res) 
names(multi_res) <- c("y","x","b","t")
multi_res$t <- abs(multi_res$t)


multi_res$Method <- "Multilevel"
multi_res$Model <- seq(1:nrow(multi_res))
most_sig$Method <- "Standard"
most_sig$Model <- seq(1:nrow(multi_res))
combined_multi <- rbind(most_sig, multi_res)

# not run: to compare MLM results to unadjusted results
#fixplot_multi <- ggplot(data=combined_multi, aes(x = Model, y=t, fill=Method))
#fixplot_multi + coord_flip() + geom_bar(stat="identity", position=position_dodge()) + 
#  scale_fill_manual(values=c("darkblue","lightblue")) + 
#  ylab("|t-statistic|") + geom_hline(aes(yintercept=2.576), color="red") + theme_classic() +
#  annotate("text", x=18, y=20, label="t = 2.576", color="red") +
#  annotate("segment", x=17.9, xend=17.9, y=15.5, yend=3, color="red", arrow=arrow(length=unit(.05, "inches"))) +
#  theme(legend.position = c(0.85, 0.8), axis.text.y=element_blank())
#fixplot_multi_out <- fixplot_multi + coord_flip() + geom_bar(stat="identity", position=position_dodge()) + 
#  scale_fill_manual(values=c("darkblue","lightblue")) + 
#  ylab("|t-statistic|") + geom_hline(aes(yintercept=2.576), color="red") + theme_classic() +
#  annotate("text", x=18, y=20, label="t = 2.576", color="red") +
#  annotate("segment", x=17.9, xend=17.9, y=15.5, yend=3, color="red", arrow=arrow(length=unit(.05, "inches"))) +
#  theme(legend.position = c(0.85, 0.8), axis.text.y=element_blank())
#ggsave("multi_results.pdf", plot=fixplot_multi_out, height=4, width=4, units="in")


# some descriptive statistics on the three sets of results
min(most_sig$t)
median(most_sig$t)
median(na.omit(tidied_res$t))
median(multi_res$t)

format(dt(min(most_sig$t),Inf))
median(na.omit(tidied_res$t))
median(multi_res$t)


### CREATE FIGURE 4

fixed_to_plot <- data.frame(cbind(seq(1:nrow(most_sig)),most_sig$t,tidied_res$t,multi_res$t))
names(fixed_to_plot) <- c("Mod","Standard","Clustered","Multilevel")           
fixed_to_plot <- melt(fixed_to_plot,id=c("Mod","Standard"))
names(fixed_to_plot) <- c("Mod","Standard","Method","Fixed")           

fixed_plot <- ggplot(data=fixed_to_plot, aes(x = Fixed, y=Standard, color=Method))
fixed_plot + geom_point()  + theme_classic() + 
  geom_vline(aes(xintercept=2.576), linetype="dashed") + theme(legend.position = c(0.9, 0.7)) + 
  scale_color_manual(values=c("black", "gray")) +
  annotate("text", x=8, y=21, label="t = 2.576") +
  annotate("segment", x=6, xend=3, y=21, yend=21, arrow=arrow(length=unit(.05, "inches"))) +
  ylab("|t-statistic|, Unadjusted Models") + xlab("|t-statistic|, Adjusted Models")
fixed_plot_out <- fixed_plot + geom_point()  + theme_classic() + 
  geom_vline(aes(xintercept=2.576), linetype="dashed") + theme(legend.position = c(0.9, 0.8)) + 
  scale_color_manual(values=c("black", "gray")) +
  annotate("text", x=4, y=21, label="t = 2.576") +
  annotate("segment", x=3.3, xend=2.7, y=21, yend=21, arrow=arrow(length=unit(.05, "inches"))) +
  ylab("|t-statistic|, Unadjusted Models") + xlab("|t-statistic|, Adjusted Models")
ggsave("figure4.pdf", plot=fixed_plot_out, height=3, width=5, units="in")



######## APPENDIX C: SIMULATIONS, ETC. ###################

set.seed(14850)
lang.list <- names(table(matched_wals_wvs_home$Language.at.home))

to_simulate <- function() {
  
  # assign 1/0 variable to all languages at random
  random.features <- data.frame(cbind(lang.list,rbinom(length(lang.list),1,.5)))
  names(random.features) <- c("Language.at.home","random.feature")
  matched_wals_wvs_home_random <- merge(random.features, matched_wals_wvs_home, all.x=TRUE, by="Language.at.home")
  
  # randomly choose a DV
  random.dv <- sample(c(18:23,25:35,37:42,44:45),1)
  matched_wals_wvs_home_random$to.fit <- unlist(matched_wals_wvs_home_random[random.dv])
  
  # a function that estimates each model
  toreturn <- tryCatch(
    {
      fit.lm <- lm(to.fit ~ random.feature + factor(Social.class..subjective.) + factor(Employment.status) +
                     Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Scale.of.incomes) +
                     factor(Highest.educational.level.attained)  +
                     factor(Country.region) + factor(Year.survey), data=matched_wals_wvs_home_random)
      vcov_twoway <- cluster.vcov(fit.lm, cbind(matched_wals_wvs_home_random$Country.region, matched_wals_wvs_home_random$Year.survey))
      cl2 <- coeftest(fit.lm, vcov_twoway)
      fit.lmer <- lmer(to.fit ~ random.feature + factor(Social.class..subjective.) + factor(Employment.status) +
                         Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Scale.of.incomes) +
                         factor(Highest.educational.level.attained)  +
                         factor(Country.region) + factor(Year.survey) + (1 | Language.at.home), data=matched_wals_wvs_home_random)
      
      c(summary(fit.lm)$coefficients[2,3],
        cl2[2,3],
        summary(fit.lmer)$coefficients[2,3])
    },
    error=function(cond) {
      c(NA,NA,NA)
    }
  )
  
  return(toreturn)
}

# run the simulation
reps <- replicate(250,to_simulate())

# organize the results and check how significant
results <- na.omit(data.frame(t(reps)))
length(results[,1][abs(results[,1])>2.576])/length(results[,1])
length(results[,2][abs(results[,2])>2.576])/length(results[,2])
length(results[,3][abs(results[,3])>2.576])/length(results[,3])

# create Figure C1
to.plot <- melt(results)
to.plot$value <- abs(to.plot$value)
p <- ggplot(to.plot,aes(x=value, fill=variable)) 
p + geom_density(alpha=0.75, color=NA) + 
  geom_vline(aes(xintercept=2.576), linetype="dashed") + theme_classic() + 
  xlab("|t-statistic| of randomly generated linguistic feature") + ylab("Density") +
  annotate("text", x=c(6.2), y=c(.4), label=c("t = 2.576")) +
  scale_fill_manual(values = c("black", "gray50", "gray80"), 
                    name = "Method", labels = c("OLS with Fixed Effects", "Twoway Clustered SEs", "Multilevel Model")) + 
  theme(legend.position = c(0.8, 0.5)) +
  annotate("segment", x=5, xend=2.8, y=.4, yend=.4, arrow=arrow(length=unit(.05, "inches")))
p_out <- p + geom_density(alpha=0.75, color=NA) + 
  geom_vline(aes(xintercept=2.576), linetype="dashed") + theme_classic() + 
  xlab("|t-statistic| of randomly generated linguistic feature") + ylab("Density") +
  annotate("text", x=c(6.2), y=c(.4), label=c("t = 2.576")) +
  scale_fill_manual(values = c("black", "gray50", "gray80"), 
                    name = "Method", labels = c("OLS with Fixed Effects", "Twoway Clustered SEs", "Multilevel Model")) + 
  theme(legend.position = c(0.8, 0.5)) +
  annotate("segment", x=5, xend=2.8, y=.4, yend=.4, arrow=arrow(length=unit(.05, "inches")))
ggsave("figureC1.pdf", plot=p_out, height=3, width=6, units="in")



##### THESE TABLES CHECK TO IF ESTIMATES THAT WE THINK SHOULD BE SIGNIFICANT ACTUALLY ARE

# TABLE C1: saving

tidyfit <- tidy(lm(Family.savings.during.past.year ~ factor(Employment.status) + factor(Scale.of.incomes)  +
                     factor(Social.class..subjective.) + Politeness.Distinctions.in.Pronouns + 
                     Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Highest.educational.level.attained) +
                     factor(Country.region) + factor(Year.survey), data=matched_wals_wvs_home))
fit <- lm(Family.savings.during.past.year ~ factor(Employment.status) + factor(Scale.of.incomes)  +
            factor(Social.class..subjective.) + Politeness.Distinctions.in.Pronouns + 
            Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Highest.educational.level.attained) +
            factor(Country.region) + factor(Year.survey), data=matched_wals_wvs_home)
fit.lmer <- lmer(Family.savings.during.past.year ~ factor(Employment.status) + factor(Scale.of.incomes)  +
                   factor(Social.class..subjective.) + Politeness.Distinctions.in.Pronouns + 
                   Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Highest.educational.level.attained) +
                   factor(Country.region) + factor(Year.survey) + (1 | Language.at.home), data=matched_wals_wvs_home)

vcov_oneway <- cluster.vcov(fit, matched_wals_wvs_home$Country.region)
vcov_twoway <- cluster.vcov(fit, cbind(matched_wals_wvs_home$Country.region, matched_wals_wvs_home$Year.survey))

stargazer.labels <- c("Housewife","Other","Part-Time","Retired","Self-Employed","Student","Unemployed")
stargazer.columns <- c("OLS", "Twoway Clustered SEs", "Multilevel Model")
stargazer(fit, coeftest(fit, vcov_twoway), fit.lmer, 
          keep=c("Employment.status"), covariate.labels = stargazer.labels, model.names=FALSE,
          dep.var.labels.include=FALSE, dep.var.caption="", 
          no.space=TRUE, star.cutoffs = c(0.05, 0.01, 0.001),
          column.labels = stargazer.columns, keep.stat = c("n"), label="table:c_t1",
          out="tableC1.tex")
stargazer(fit, coeftest(fit, vcov_twoway), fit.lmer, 
          keep=c("Employment.status"), covariate.labels = stargazer.labels, model.names=FALSE,
          dep.var.labels.include=FALSE, dep.var.caption="", 
          no.space=TRUE, star.cutoffs = c(0.05, 0.01, 0.001),
          column.labels = stargazer.columns, keep.stat = c("n"), label="table:c_t1",
          out="tableC1.html")


# TABLE C2: social class and left-right positioning

tidyfit <- tidy(lm(Self.positioning.in.political.scale ~ factor(Social.class..subjective.) + Politeness.Distinctions.in.Pronouns + factor(Employment.status) +
                     Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Scale.of.incomes) +
                     factor(Highest.educational.level.attained)  +
                     factor(Country.region) + factor(Year.survey), data=matched_wals_wvs_home))
fit <- lm(Self.positioning.in.political.scale ~ factor(Social.class..subjective.) + Politeness.Distinctions.in.Pronouns + factor(Employment.status) +
            Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Scale.of.incomes) +
            factor(Highest.educational.level.attained)  +
            factor(Country.region) + factor(Year.survey), data=matched_wals_wvs_home)
fit.lmer <- lmer(Self.positioning.in.political.scale ~ factor(Social.class..subjective.) + Politeness.Distinctions.in.Pronouns + factor(Employment.status) +
                   Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Scale.of.incomes) +
                   factor(Highest.educational.level.attained)  +
                   factor(Country.region) + factor(Year.survey) + (1 | Language.at.home), data=matched_wals_wvs_home)

vcov_oneway <- cluster.vcov(fit, matched_wals_wvs_home$Country.region)
vcov_twoway <- cluster.vcov(fit, cbind(matched_wals_wvs_home$Country.region, matched_wals_wvs_home$Year.survey))


stargazer.labels <- c("Lower middle class","Upper class","Upper  middle class","Working class")
stargazer.columns <- c("OLS", "Twoway Clustered SEs", "Multilevel Model")
stargazer(fit, coeftest(fit, vcov_twoway), fit.lmer, 
          keep=c("Social.class..subjective"), covariate.labels = stargazer.labels, model.names=FALSE,
          dep.var.labels.include=FALSE, dep.var.caption="",
          no.space=TRUE, star.cutoffs = c(0.05, 0.01, 0.001),
          column.labels = stargazer.columns, keep.stat = c("n"), label="table:c_t2",
          out="tableC2.tex")
stargazer(fit, coeftest(fit, vcov_twoway), fit.lmer, 
          keep=c("Social.class..subjective"), covariate.labels = stargazer.labels, model.names=FALSE,
          dep.var.labels.include=FALSE, dep.var.caption="",
          no.space=TRUE, star.cutoffs = c(0.05, 0.01, 0.001),
          column.labels = stargazer.columns, keep.stat = c("n"), label="table:c_t2",
          out="tableC2.html")


# TABLE C3: incomes and preferences for inequality

matched_wals_wvs_home <- within(matched_wals_wvs_home, Scale.of.incomes <- factor(Scale.of.incomes))
matched_wals_wvs_home <- within(matched_wals_wvs_home, Scale.of.incomes <- relevel(Scale.of.incomes, ref = "Lower step"))

tidyfit <- tidy(lm(Income.equality ~ factor(Scale.of.incomes)  +
                     factor(Social.class..subjective.) + Politeness.Distinctions.in.Pronouns + factor(Employment.status) +
                     Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Highest.educational.level.attained) +
                     factor(Country.region) + factor(Year.survey), data=matched_wals_wvs_home))
fit <- lm(Income.equality ~ factor(Scale.of.incomes)  +
            factor(Social.class..subjective.) + Politeness.Distinctions.in.Pronouns + factor(Employment.status) +
            Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Highest.educational.level.attained) +
            factor(Country.region) + factor(Year.survey), data=matched_wals_wvs_home)
fit.lmer <- lmer(Income.equality ~ factor(Scale.of.incomes)  +
                   factor(Social.class..subjective.) + Politeness.Distinctions.in.Pronouns + factor(Employment.status) +
                   Sex + as.numeric(Age) + I(as.numeric(Age)^2) + factor(Marital.status) + factor(Highest.educational.level.attained) +
                   factor(Country.region) + factor(Year.survey) + (1 | Language.at.home), data=matched_wals_wvs_home)

vcov_oneway <- cluster.vcov(fit, matched_wals_wvs_home$Country.region)
vcov_twoway <- cluster.vcov(fit, cbind(matched_wals_wvs_home$Country.region, matched_wals_wvs_home$Year.survey))

stargazer.labels <- c("Level 2","Level 3","Level 4","Level 5","Level 6","Level 7","Level 8","Level 9","Level 10")
stargazer.columns <- c("OLS", "Twoway Clustered SEs", "Multilevel Model")
stargazer(fit, coeftest(fit, vcov_twoway), fit.lmer, 
          keep=c("Scale.of.incomes"), model.names=FALSE,
          dep.var.labels.include=FALSE, dep.var.caption="",
          column.labels = stargazer.columns, keep.stat = c("n"),
          order=c(5,9,3,2,7,6,1,4,8), covariate.labels = stargazer.labels, 
          no.space=TRUE, star.cutoffs = c(0.05, 0.01, 0.001),
          label="table:c_t3", 
          out="tableC3.tex")
stargazer(fit, coeftest(fit, vcov_twoway), fit.lmer, 
          keep=c("Scale.of.incomes"), model.names=FALSE,
          dep.var.labels.include=FALSE, dep.var.caption="",
          column.labels = stargazer.columns, keep.stat = c("n"),
          order=c(5,9,3,2,7,6,1,4,8), covariate.labels = stargazer.labels, 
          no.space=TRUE, star.cutoffs = c(0.05, 0.01, 0.001),
          label="table:c_t3", 
          out="tableC3.html")




